import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("risk_data.csv") # Load data
df.shape
(5042, 122)
df["DAYS_EMPLOYED"].describe()
count 5042.000000 mean -2395.756446 std 2397.824075 min -16121.000000 25% -3184.750000 50% -1649.000000 75% -746.000000 max -9.000000 Name: DAYS_EMPLOYED, dtype: float64
365243 /365
1000.6657534246575
df.isna().sum()
SK_ID_CURR 0
TARGET 0
NAME_CONTRACT_TYPE 0
CODE_GENDER 0
FLAG_OWN_CAR 0
...
AMT_REQ_CREDIT_BUREAU_DAY 666
AMT_REQ_CREDIT_BUREAU_WEEK 666
AMT_REQ_CREDIT_BUREAU_MON 666
AMT_REQ_CREDIT_BUREAU_QRT 666
AMT_REQ_CREDIT_BUREAU_YEAR 666
Length: 122, dtype: int64
# Drop Unnecessary data
drop = ["SK_ID_CURR", "OWN_CAR_AGE", "FLAG_EMP_PHONE", "FLAG_WORK_PHONE",
"FLAG_CONT_MOBILE", "FLAG_PHONE", "EXT_SOURCE_1", "EXT_SOURCE_2",
"EXT_SOURCE_3", "BASEMENTAREA_AVG", "YEARS_BEGINEXPLUATATION_AVG",
"YEARS_BUILD_AVG","COMMONAREA_AVG", "ELEVATORS_AVG", "ENTRANCES_AVG",
"FLOORSMAX_AVG", "FLOORSMIN_AVG", "LANDAREA_AVG","LIVINGAPARTMENTS_AVG",
"LIVINGAREA_AVG", "NONLIVINGAPARTMENTS_AVG","NONLIVINGAREA_AVG","APARTMENTS_MODE",
"BASEMENTAREA_MODE",'YEARS_BEGINEXPLUATATION_MODE',"YEARS_BUILD_MODE","COMMONAREA_MODE",
"ELEVATORS_MODE","ENTRANCES_MODE","FLOORSMAX_MODE","FLOORSMIN_MODE","LANDAREA_MODE",
'LIVINGAPARTMENTS_MODE', "LIVINGAREA_MODE","NONLIVINGAPARTMENTS_MODE","NONLIVINGAREA_MODE",
"APARTMENTS_MEDI","BASEMENTAREA_MEDI","YEARS_BEGINEXPLUATATION_MEDI","YEARS_BUILD_MEDI",
"COMMONAREA_MEDI","ELEVATORS_MEDI","ENTRANCES_MEDI","FLOORSMAX_MEDI","FLOORSMIN_MEDI",
"LANDAREA_MEDI","LIVINGAPARTMENTS_MEDI","LIVINGAREA_MEDI","NONLIVINGAPARTMENTS_MEDI","NONLIVINGAREA_MEDI",
"TOTALAREA_MODE","OBS_30_CNT_SOCIAL_CIRCLE","DEF_30_CNT_SOCIAL_CIRCLE","OBS_60_CNT_SOCIAL_CIRCLE",
"DEF_60_CNT_SOCIAL_CIRCLE","DAYS_LAST_PHONE_CHANGE","FLAG_DOCUMENT_2","FLAG_DOCUMENT_3","FLAG_DOCUMENT_4",
"FLAG_DOCUMENT_5","FLAG_DOCUMENT_6","FLAG_DOCUMENT_7","FLAG_DOCUMENT_8","FLAG_DOCUMENT_9","FLAG_DOCUMENT_10",
"FLAG_DOCUMENT_11","FLAG_DOCUMENT_12","FLAG_DOCUMENT_13","FLAG_DOCUMENT_14","FLAG_DOCUMENT_15",
"FLAG_DOCUMENT_16","FLAG_DOCUMENT_17","FLAG_DOCUMENT_18","FLAG_DOCUMENT_19","FLAG_DOCUMENT_20","FLAG_DOCUMENT_21",
"APARTMENTS_AVG", "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE", "EMERGENCYSTATE_MODE", "OCCUPATION_TYPE",
"FLAG_MOBIL", "FLAG_EMAIL"]
df.drop(drop, axis=1, inplace=True)
df.isna().sum()
df.dropna(inplace=True)
df.shape
(4363, 38)
df.columns
Index(['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'CNT_FAM_MEMBERS',
'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
'ORGANIZATION_TYPE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
'AMT_REQ_CREDIT_BUREAU_YEAR'],
dtype='object')
df.describe()
df.drop(["AMT_REQ_CREDIT_BUREAU_DAY","AMT_REQ_CREDIT_BUREAU_WEEK",
"AMT_REQ_CREDIT_BUREAU_HOUR","AMT_REQ_CREDIT_BUREAU_YEAR",
"AMT_REQ_CREDIT_BUREAU_QRT","AMT_REQ_CREDIT_BUREAU_MON"],axis=1, inplace=True)
drop = ["REG_REGION_NOT_LIVE_REGION",
"REG_REGION_NOT_WORK_REGION",
"LIVE_REGION_NOT_WORK_REGION",
"REG_CITY_NOT_LIVE_CITY",
"REG_CITY_NOT_WORK_CITY",
"LIVE_CITY_NOT_WORK_CITY"]
df.drop(drop, axis=1, inplace=True)
df.describe()
| TARGET | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4363.000000 | 4363.000000 | 4.363000e+03 | 4.363000e+03 | 4363.000000 | 4.363000e+03 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 |
| mean | 0.082970 | 0.471923 | 1.797927e+05 | 6.317434e+05 | 28244.090649 | 5.668660e+05 | 0.021071 | -15028.916571 | -2437.450607 | -4728.646115 | -2875.125602 | 2.224158 | 2.039881 | 2.021774 | 12.285354 |
| std | 0.275869 | 0.717776 | 1.006784e+05 | 4.168209e+05 | 14728.222054 | 3.821942e+05 | 0.013931 | 3646.057455 | 2409.073236 | 3300.520654 | 1491.215163 | 0.898158 | 0.516811 | 0.512327 | 3.302137 |
| min | 0.000000 | 0.000000 | 3.510000e+04 | 4.500000e+04 | 2673.000000 | 4.500000e+04 | 0.000533 | -25126.000000 | -15860.000000 | -18294.000000 | -6207.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 1.125000e+05 | 2.844000e+05 | 17286.750000 | 2.475000e+05 | 0.010006 | -17880.500000 | -3267.000000 | -7128.000000 | -4207.500000 | 2.000000 | 2.000000 | 2.000000 | 10.000000 |
| 50% | 0.000000 | 0.000000 | 1.575000e+05 | 5.400000e+05 | 26284.500000 | 4.545000e+05 | 0.018850 | -14864.000000 | -1686.000000 | -4298.000000 | -2995.000000 | 2.000000 | 2.000000 | 2.000000 | 12.000000 |
| 75% | 0.000000 | 1.000000 | 2.250000e+05 | 8.550000e+05 | 35964.000000 | 7.290000e+05 | 0.028663 | -12053.500000 | -766.000000 | -1944.000000 | -1629.000000 | 3.000000 | 2.000000 | 2.000000 | 15.000000 |
| max | 1.000000 | 4.000000 | 1.350000e+06 | 2.700000e+06 | 225000.000000 | 2.700000e+06 | 0.072508 | -7721.000000 | -9.000000 | -3.000000 | -1.000000 | 6.000000 | 3.000000 | 3.000000 | 23.000000 |
df.dtypes
TARGET int64 NAME_CONTRACT_TYPE object CODE_GENDER object FLAG_OWN_CAR object FLAG_OWN_REALTY object CNT_CHILDREN int64 AMT_INCOME_TOTAL float64 AMT_CREDIT float64 AMT_ANNUITY float64 AMT_GOODS_PRICE float64 NAME_TYPE_SUITE object NAME_INCOME_TYPE object NAME_EDUCATION_TYPE object NAME_FAMILY_STATUS object NAME_HOUSING_TYPE object REGION_POPULATION_RELATIVE float64 DAYS_BIRTH int64 DAYS_EMPLOYED int64 DAYS_REGISTRATION float64 DAYS_ID_PUBLISH int64 CNT_FAM_MEMBERS float64 REGION_RATING_CLIENT int64 REGION_RATING_CLIENT_W_CITY int64 WEEKDAY_APPR_PROCESS_START object HOUR_APPR_PROCESS_START int64 ORGANIZATION_TYPE object dtype: object
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_CREDIT")
ax = sns.distplot(df["AMT_CREDIT"])
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/2776091565.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 ax = sns.distplot(df["AMT_CREDIT"])
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_CREDIT")
ax = sns.distplot(np.log(df["AMT_CREDIT"]))
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/854655729.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 ax = sns.distplot(np.log(df["AMT_CREDIT"]))
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_INCOME_TOTAL")
ax = sns.distplot(df["AMT_INCOME_TOTAL"].dropna())
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/873008540.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 ax = sns.distplot(df["AMT_INCOME_TOTAL"].dropna())
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_GOODS_PRICE")
ax = sns.distplot(df["AMT_GOODS_PRICE"].dropna())
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/3108533166.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 ax = sns.distplot(df["AMT_GOODS_PRICE"].dropna())
Some of variables such as Region_rating_client_W_CITY has very bad distribution.
df.describe()
| TARGET | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4363.000000 | 4363.000000 | 4.363000e+03 | 4.363000e+03 | 4363.000000 | 4.363000e+03 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 |
| mean | 0.082970 | 0.471923 | 1.797927e+05 | 6.317434e+05 | 28244.090649 | 5.668660e+05 | 0.021071 | -15028.916571 | -2437.450607 | -4728.646115 | -2875.125602 | 2.224158 | 2.039881 | 2.021774 | 12.285354 |
| std | 0.275869 | 0.717776 | 1.006784e+05 | 4.168209e+05 | 14728.222054 | 3.821942e+05 | 0.013931 | 3646.057455 | 2409.073236 | 3300.520654 | 1491.215163 | 0.898158 | 0.516811 | 0.512327 | 3.302137 |
| min | 0.000000 | 0.000000 | 3.510000e+04 | 4.500000e+04 | 2673.000000 | 4.500000e+04 | 0.000533 | -25126.000000 | -15860.000000 | -18294.000000 | -6207.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 1.125000e+05 | 2.844000e+05 | 17286.750000 | 2.475000e+05 | 0.010006 | -17880.500000 | -3267.000000 | -7128.000000 | -4207.500000 | 2.000000 | 2.000000 | 2.000000 | 10.000000 |
| 50% | 0.000000 | 0.000000 | 1.575000e+05 | 5.400000e+05 | 26284.500000 | 4.545000e+05 | 0.018850 | -14864.000000 | -1686.000000 | -4298.000000 | -2995.000000 | 2.000000 | 2.000000 | 2.000000 | 12.000000 |
| 75% | 0.000000 | 1.000000 | 2.250000e+05 | 8.550000e+05 | 35964.000000 | 7.290000e+05 | 0.028663 | -12053.500000 | -766.000000 | -1944.000000 | -1629.000000 | 3.000000 | 2.000000 | 2.000000 | 15.000000 |
| max | 1.000000 | 4.000000 | 1.350000e+06 | 2.700000e+06 | 225000.000000 | 2.700000e+06 | 0.072508 | -7721.000000 | -9.000000 | -3.000000 | -1.000000 | 6.000000 | 3.000000 | 3.000000 | 23.000000 |
import plotly.express as px
num_vars = df.select_dtypes(include=['float', 'int']).columns.tolist()
# Plot each numerical variable
for var in num_vars:
fig = px.box(df, x=var)
fig.show()
import plotly.graph_objects as go
numeric_cols = df.select_dtypes(include=['float64', 'int64'])
traces = []
for col in numeric_cols.columns:
trace = go.Box(
y=numeric_cols[col],
name=col,
boxpoints='outliers'
)
traces.append(trace)
# Create the layout and plot the figure
layout = go.Layout(
title='Boxplot of Numeric Variables',
xaxis=dict(title='Value')
)
fig = go.Figure(data=traces, layout=layout)
fig.show()
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_cols = scaler.fit_transform(numeric_cols)
scaled_df = pd.DataFrame(scaled_cols, columns=numeric_cols.columns)
# Create a list of traces for each column
traces = []
for col in scaled_df.columns:
trace = go.Box(
y=scaled_df[col],
name=col,
boxpoints='outliers'
)
traces.append(trace)
# Create the layout and plot the figure
layout = go.Layout(
title='Boxplot of Scaled Numeric Variables',
xaxis=dict(title='Scaled Value')
)
fig = go.Figure(data=traces, layout=layout)
fig.show()
from plotly import tools
from plotly.offline import init_notebook_mode, iplot
def bar_hor(df, col, title, color, w=None, h=None, lm=0, limit=100, return_trace=False, rev=False, xlb = False):
cnt_srs = df[col].value_counts()
yy = cnt_srs.head(limit).index[::-1]
xx = cnt_srs.head(limit).values[::-1]
if rev:
yy = cnt_srs.tail(limit).index[::-1]
xx = cnt_srs.tail(limit).values[::-1]
if xlb:
trace = go.Bar(y=xlb, x=xx, orientation = 'h', marker=dict(color=color))
else:
trace = go.Bar(y=yy, x=xx, orientation = 'h', marker=dict(color=color))
if return_trace:
return trace
layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
data = [trace]
fig = go.Figure(data=data, layout=layout)
return fig
bar_hor(df, "TARGET", "Distribution of Target Variable" ,
["#96D38C", '#FEBFB3'], h=350, w=600, lm=200, xlb = ['Target : 1','Target : 0'])
def gp(col, title):
df1 = df[df["TARGET"] == 1]
df0 = df[df["TARGET"] == 0]
a1 = df1[col].value_counts()
b1 = df0[col].value_counts()
total = dict(df[col].value_counts())
x0 = a1.index
x1 = b1.index
y0 = [float(x)*100 / total[x0[i]] for i,x in enumerate(a1.values)]
y1 = [float(x)*100 / total[x1[i]] for i,x in enumerate(b1.values)]
trace1 = go.Bar(x=a1.index, y=y0, name='Target : 1', marker=dict(color="#96D38C"))
trace2 = go.Bar(x=b1.index, y=y1, name='Target : 0', marker=dict(color="#FEBFB3"))
return trace1, trace2
tr0 = bar_hor(df, "CODE_GENDER", "Distribution of CODE_GENDER Variable" ,"#f975ae", w=700, lm=100, return_trace= True)
tr1, tr2 = gp('CODE_GENDER', 'Distribution of Target with Applicant Gender')
fig = tools.make_subplots(rows=1, cols=3, print_grid=False, subplot_titles = ["Gender Distribution" , "Gender, Target=1" ,"Gender, Target=0"])
fig.append_trace(tr0, 1, 1);
fig.append_trace(tr1, 1, 2);
fig.append_trace(tr2, 1, 3);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=50));
iplot(fig);
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning: plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead
Number of Females are much more than number of Men in the data
import plotly.graph_objects as go
temp = df["NAME_TYPE_SUITE"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
title = "Who accompanied client when applying for the application in % ",
xaxis=dict(
title='Name of type of the Suite',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count of Name of type of the Suite in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
fig
import plotly.graph_objects as go
temp = df["NAME_INCOME_TYPE"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
title = "Who accompanied client when applying for the application in % ",
xaxis=dict(
title='Name of type of the Suite',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count of Name of type of the Suite in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
fig
df["NAME_INCOME_TYPE"] = df["NAME_INCOME_TYPE"].map({"Commercial associate":"Businessman",
"Student":"Unemployed", "Maternity leave":"Unemployed",
"State servant":"State servent", "Working":"Working",
"Unemployed":"Unemployed","Pensioner":"Pensioner"})
df["NAME_TYPE_SUITE"] = df["NAME_TYPE_SUITE"].map({"Spouse, partner": "Others",
"Children":"Others","Other_B":"Others",
"Other_A":"Others","Group of people":"Others",
"Unaccompanied":"Unaccompanied","Family":"Family"})
df = df[df["NAME_INCOME_TYPE"] != "Unemployed"]
tr0 = bar_hor(df, "NAME_TYPE_SUITE", "Distribution of CODE_GENDER Variable" ,"#f975ae", w=700, lm=100, return_trace= True)
tr1 = bar_hor(df, "NAME_INCOME_TYPE", "Distribution of CODE_GENDER Variable" ,"#f975ae", w=700, lm=100, return_trace= True)
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['Applicants Suite Type' , 'Applicants Income Type'])
fig.append_trace(tr0, 1, 1);
fig.append_trace(tr1, 1, 2);
fig['layout'].update(height=400, showlegend=False, margin=dict(l=100));
iplot(fig);
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning: plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead
tr1, tr2 = gp('NAME_TYPE_SUITE', 'Applicants Type Suites which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False,
subplot_titles = ["Applicants Type Suites distribution when Target = 1", "Applicants Type Suites distribution when Target = 0"])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr2, 1, 2);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=120));
iplot(fig);
tr1, tr2 = gp('NAME_INCOME_TYPE', 'Applicants Income Types which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False,
subplot_titles = ["Applicants Income Types when Target = 1", "Applicants Income Type When Target = 0"])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr2, 1, 2);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=120));
iplot(fig);
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning: plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning: plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead
temp = df["FLAG_OWN_CAR"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
title = "Who accompanied client when applying for the application in % ",
xaxis=dict(
title='Name of type of the Suite',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count of Name of type of the Suite in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
fig
tr1 = bar_hor(df, "NAME_EDUCATION_TYPE", "Distribution of " ,"#f975ae", w=700, lm=100, return_trace= True)
tr2 = bar_hor(df, "NAME_HOUSING_TYPE", "Distribution of " ,"#f975ae", w=700, lm=100, return_trace = True)
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['Applicants Education Type', 'Applicants Housing Type' ])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr2, 1, 2);
fig['layout'].update(height=400,showlegend=False, margin=dict(l=100));
iplot(fig);
tr1, tr2 = gp('NAME_EDUCATION_TYPE', 'Applicants Income Types which repayed the loan')
tr3, tr4 = gp('NAME_HOUSING_TYPE', 'Applicants Income Types which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False,
subplot_titles = ["Applicants Education Types, Target=1", "Applicants Housing Type, Target=1"])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr3, 1, 2);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=30));
iplot(fig);
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning: plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning: plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead
tr1 = bar_hor(df, "NAME_FAMILY_STATUS", "Distribution of " ,"#f975ae", w=700, lm=100, return_trace= True)
tr2 = bar_hor(df, "FLAG_OWN_CAR", "Distribution of " ,"#f975ae", w=700, lm=100, return_trace = True)
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['NAME_FAMILY_STATUS', 'FLAG OWN CAR' ])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr2, 1, 2);
fig['layout'].update(height=400,showlegend=False, margin=dict(l=100));
iplot(fig);
tr1, tr2 = gp('NAME_FAMILY_STATUS', 'FLAG_OWN_CAR which repayed the loan')
tr3, tr4 = gp('FLAG_OWN_CAR', 'FLAG_OWN_CAR which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False,
subplot_titles = ['NAME_FAMILY_STATUS, target = 1', 'FLAG OWN CAR. target = 1'])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr3, 1, 2);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=30));
iplot(fig);
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning: plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning: plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead
df.dtypes
TARGET int64 NAME_CONTRACT_TYPE object CODE_GENDER object FLAG_OWN_CAR object FLAG_OWN_REALTY object CNT_CHILDREN int64 AMT_INCOME_TOTAL float64 AMT_CREDIT float64 AMT_ANNUITY float64 AMT_GOODS_PRICE float64 NAME_TYPE_SUITE object NAME_INCOME_TYPE object NAME_EDUCATION_TYPE object NAME_FAMILY_STATUS object NAME_HOUSING_TYPE object REGION_POPULATION_RELATIVE float64 DAYS_BIRTH int64 DAYS_EMPLOYED int64 DAYS_REGISTRATION float64 DAYS_ID_PUBLISH int64 CNT_FAM_MEMBERS float64 REGION_RATING_CLIENT int64 REGION_RATING_CLIENT_W_CITY int64 WEEKDAY_APPR_PROCESS_START object HOUR_APPR_PROCESS_START int64 ORGANIZATION_TYPE object dtype: object
df["TARGET"] = df["TARGET"].astype(str)
from sklearn.preprocessing import MinMaxScaler
# select only numerical columns
num_cols = df.select_dtypes(include=['float',"int"]).columns
# create scaler object
scaler = MinMaxScaler()
# fit and transform data using scaler
scaled_data = scaler.fit_transform(df[num_cols])
# assign back to original column names
df.loc[:, num_cols] = scaled_data
df.columns
Index(['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'CNT_FAM_MEMBERS',
'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
'ORGANIZATION_TYPE'],
dtype='object')
df.describe()
| CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 | 4363.000000 |
| mean | 0.117981 | 0.110041 | 0.220996 | 0.115016 | 0.196560 | 0.285346 | 0.580125 | 0.846795 | 0.741641 | 0.536880 | 0.244832 | 0.519940 | 0.510887 | 0.534146 |
| std | 0.179444 | 0.076567 | 0.156995 | 0.066246 | 0.143953 | 0.193560 | 0.209483 | 0.151982 | 0.180445 | 0.240286 | 0.179632 | 0.258405 | 0.256163 | 0.143571 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.058864 | 0.090169 | 0.065731 | 0.076271 | 0.131615 | 0.416288 | 0.794461 | 0.610464 | 0.322188 | 0.200000 | 0.500000 | 0.500000 | 0.434783 |
| 50% | 0.000000 | 0.093087 | 0.186441 | 0.106202 | 0.154237 | 0.254491 | 0.589601 | 0.894202 | 0.765185 | 0.517564 | 0.200000 | 0.500000 | 0.500000 | 0.521739 |
| 75% | 0.250000 | 0.144422 | 0.305085 | 0.149739 | 0.257627 | 0.390830 | 0.751077 | 0.952243 | 0.893882 | 0.737673 | 0.400000 | 0.500000 | 0.500000 | 0.652174 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
from scipy.stats import boxcox
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_CREDIT")
ax = sns.distplot((df["AMT_CREDIT"]))
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/1739910397.py:5: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
from sklearn.preprocessing import PolynomialFeatures
df_num = df[num_cols]
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
crossed = poly.fit_transform(df_num)
# create feature names manually
feature_names = poly.get_feature_names_out(df_num.columns)
feature_names = [name.replace(' ', '*') for name in feature_names]
# print the crossed features and their names
df_crossing = pd.DataFrame(crossed, columns=feature_names)
df_crossing.head()
| CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | ... | DAYS_ID_PUBLISH*CNT_FAM_MEMBERS | DAYS_ID_PUBLISH*REGION_RATING_CLIENT | DAYS_ID_PUBLISH*REGION_RATING_CLIENT_W_CITY | DAYS_ID_PUBLISH*HOUR_APPR_PROCESS_START | CNT_FAM_MEMBERS*REGION_RATING_CLIENT | CNT_FAM_MEMBERS*REGION_RATING_CLIENT_W_CITY | CNT_FAM_MEMBERS*HOUR_APPR_PROCESS_START | REGION_RATING_CLIENT*REGION_RATING_CLIENT_W_CITY | REGION_RATING_CLIENT*HOUR_APPR_PROCESS_START | REGION_RATING_CLIENT_W_CITY*HOUR_APPR_PROCESS_START | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00 | 0.315537 | 0.375729 | 0.171052 | 0.322034 | 0.193553 | 0.490779 | 0.431077 | 0.760757 | 0.979053 | ... | 0.195811 | 0.489526 | 0.489526 | 0.297973 | 0.1 | 0.1 | 0.060870 | 0.25 | 0.152174 | 0.152174 |
| 1 | 0.50 | 0.058864 | 0.224136 | 0.128588 | 0.152542 | 0.489878 | 0.851422 | 0.799697 | 0.876169 | 0.517403 | ... | 0.310442 | 0.258701 | 0.258701 | 0.269949 | 0.3 | 0.3 | 0.313043 | 0.25 | 0.260870 | 0.260870 |
| 2 | 0.25 | 0.058864 | 0.101356 | 0.074404 | 0.067797 | 0.080195 | 0.960873 | 0.964608 | 0.850582 | 0.870770 | ... | 0.348308 | 0.870770 | 0.870770 | 0.189298 | 0.4 | 0.4 | 0.086957 | 1.00 | 0.217391 | 0.217391 |
| 3 | 0.50 | 0.058864 | 0.066873 | 0.068231 | 0.059322 | 0.076068 | 0.817639 | 0.883162 | 0.725603 | 0.882050 | ... | 0.529230 | 0.441025 | 0.441025 | 0.383500 | 0.3 | 0.3 | 0.260870 | 0.25 | 0.217391 | 0.217391 |
| 4 | 0.00 | 0.134155 | 0.340420 | 0.113347 | 0.281356 | 0.112428 | 0.526400 | 0.954262 | 0.452299 | 0.365292 | ... | 0.073058 | 0.182646 | 0.182646 | 0.174705 | 0.1 | 0.1 | 0.095652 | 0.25 | 0.239130 | 0.239130 |
5 rows × 105 columns
df_crossing.columns
Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
...
'DAYS_ID_PUBLISH*CNT_FAM_MEMBERS',
'DAYS_ID_PUBLISH*REGION_RATING_CLIENT',
'DAYS_ID_PUBLISH*REGION_RATING_CLIENT_W_CITY',
'DAYS_ID_PUBLISH*HOUR_APPR_PROCESS_START',
'CNT_FAM_MEMBERS*REGION_RATING_CLIENT',
'CNT_FAM_MEMBERS*REGION_RATING_CLIENT_W_CITY',
'CNT_FAM_MEMBERS*HOUR_APPR_PROCESS_START',
'REGION_RATING_CLIENT*REGION_RATING_CLIENT_W_CITY',
'REGION_RATING_CLIENT*HOUR_APPR_PROCESS_START',
'REGION_RATING_CLIENT_W_CITY*HOUR_APPR_PROCESS_START'],
dtype='object', length=105)
df_crossing["AMT_CREDIT*AMT_ANNUITY"]
0 0.064269
1 0.028821
2 0.007541
3 0.004563
4 0.038585
...
4358 0.015585
4359 0.124789
4360 0.049024
4361 0.001447
4362 0.006783
Name: AMT_CREDIT*AMT_ANNUITY, Length: 4363, dtype: float64
sns.scatterplot(data=df_crossing, x="AMT_INCOME_TOTAL*AMT_ANNUITY", y="AMT_INCOME_TOTAL")
<AxesSubplot: xlabel='AMT_INCOME_TOTAL*AMT_ANNUITY', ylabel='AMT_INCOME_TOTAL'>
sns.set(font_scale=0.8)
sns.set_style("whitegrid")
sns.set_palette("husl")
sns.set(rc={"figure.figsize":(12,6)})
sns.heatmap(df_crossing.corr(), annot=False, cmap="coolwarm")
<AxesSubplot: >
import prince
famd = prince.FAMD(n_components=2, n_iter=3,
copy=True, check_input=True,
engine='sklearn',random_state=42)
famd = famd.fit(df)
coords = famd.row_coordinates(df)
sns.scatterplot(coords, x=0, y=1, hue=df["TARGET"])
<AxesSubplot: xlabel='0', ylabel='1'>
df.dtypes
TARGET object NAME_CONTRACT_TYPE object CODE_GENDER object FLAG_OWN_CAR object FLAG_OWN_REALTY object CNT_CHILDREN float64 AMT_INCOME_TOTAL float64 AMT_CREDIT float64 AMT_ANNUITY float64 AMT_GOODS_PRICE float64 NAME_TYPE_SUITE object NAME_INCOME_TYPE object NAME_EDUCATION_TYPE object NAME_FAMILY_STATUS object NAME_HOUSING_TYPE object REGION_POPULATION_RELATIVE float64 DAYS_BIRTH float64 DAYS_EMPLOYED float64 DAYS_REGISTRATION float64 DAYS_ID_PUBLISH float64 CNT_FAM_MEMBERS float64 REGION_RATING_CLIENT float64 REGION_RATING_CLIENT_W_CITY float64 WEEKDAY_APPR_PROCESS_START object HOUR_APPR_PROCESS_START float64 ORGANIZATION_TYPE object dtype: object
df_dummy = pd.get_dummies(df)
from sklearn.ensemble import IsolationForest
dat_iso = df_dummy
model = IsolationForest(n_estimators = 300,max_samples ='auto',contamination = "auto", max_features = 1.0)
model.fit(dat_iso)
scores = model.decision_function(dat_iso)
anomaly = model.predict(dat_iso)
dat_iso['scores'] = scores
dat_iso['anomaly'] = anomaly
anomaly = dat_iso.loc[dat_iso['anomaly'] == -1]
anomaly_index = list(dat_iso.index)
dat_iso_drop = dat_iso[dat_iso.anomaly == 1]
dat_iso_drop.head()
dat_iso_drop.drop(columns=['scores', 'anomaly'], inplace= True)
dat_iso_drop.head()
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/3678961677.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
| CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | ... | ORGANIZATION_TYPE_Trade: type 2 | ORGANIZATION_TYPE_Trade: type 3 | ORGANIZATION_TYPE_Trade: type 4 | ORGANIZATION_TYPE_Trade: type 6 | ORGANIZATION_TYPE_Trade: type 7 | ORGANIZATION_TYPE_Transport: type 1 | ORGANIZATION_TYPE_Transport: type 2 | ORGANIZATION_TYPE_Transport: type 3 | ORGANIZATION_TYPE_Transport: type 4 | ORGANIZATION_TYPE_University | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00 | 0.315537 | 0.375729 | 0.171052 | 0.322034 | 0.193553 | 0.490779 | 0.431077 | 0.760757 | 0.979053 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0.50 | 0.058864 | 0.224136 | 0.128588 | 0.152542 | 0.489878 | 0.851422 | 0.799697 | 0.876169 | 0.517403 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0.25 | 0.058864 | 0.101356 | 0.074404 | 0.067797 | 0.080195 | 0.960873 | 0.964608 | 0.850582 | 0.870770 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0.50 | 0.058864 | 0.066873 | 0.068231 | 0.059322 | 0.076068 | 0.817639 | 0.883162 | 0.725603 | 0.882050 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 0.00 | 0.134155 | 0.340420 | 0.113347 | 0.281356 | 0.112428 | 0.526400 | 0.954262 | 0.452299 | 0.365292 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 106 columns
df_dummy.shape
(4363, 108)
dat_iso_drop.shape
(4358, 106)
num_vars = df.select_dtypes(include=['float', 'int']).columns.tolist()
# Plot each numerical variable
for var in num_vars:
fig = px.box(dat_iso_drop, x=var)
fig.show()